Machine Learning in Python


faqs


setup

pip install scikit-learn
import sklearn
# Evaluating the attribute at a REPL displays the installed version string;
# the commented line below is sample output.
sklearn.__version__
#  '0.23.2'

basics


preprocessing

StandardScaler

from sklearn.preprocessing import StandardScaler
# Both flags are spelled out for clarity: subtract each column's mean, then
# divide by each column's standard deviation.
scl = StandardScaler(with_mean=True, with_std=True)
# fit() hands back the fitted scaler, so the transform can be chained on.
X_scaled = scl.fit(X).transform(X)

PolynomialFeatures

from sklearn.preprocessing import PolynomialFeatures
# degree=3 with include_bias=True expands a single column x into
# [1, x, x^2, x^3].
pr = PolynomialFeatures(degree=3, include_bias=True)
# The double brackets select a list of columns, keeping the input 2-D
# (assumes X is a pandas DataFrame with a column named 'x1').
pr.fit_transform(X=X[['x1']])

linear_model

LinearRegression

from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X=X_train, y=Y_train)
# Fitted parameters: the intercept term and one coefficient per feature.
lr.intercept_
lr.coef_
# in-sample R^2 (scored on the same data the model was fitted on)
lr.score(X=X_train, y=Y_train)
# Predictions for the held-out rows.
Yhat_test = lr.predict(X=X_test)

Ridge

from sklearn.linear_model import Ridge
# alpha is the strength of the L2 penalty on the coefficients.
rr = Ridge(alpha=0.1)
# fit() returns the fitted estimator, so predict() can be chained on.
yhat_pred = rr.fit(X_train, Y_train).predict(X_test)

metrics

mean_squared_error

from sklearn.metrics import mean_squared_error
# Mean of the squared differences between the true and predicted targets.
mean_squared_error(y_true=Y_test, y_pred=Yhat_test)

model_selection

train_test_split

from sklearn.model_selection import train_test_split
# test_size=0.3 holds out 30% of the rows for testing; the fixed random_state
# makes the shuffle (and therefore the split) reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
  X,
  Y,
  test_size=0.3,
  random_state=42,
)

cross_val_score

returns the evaluation metric for each of the k test folds

from sklearn.model_selection import cross_val_score
# No `scoring` argument is passed, so each fold is scored with the estimator's
# own .score() method -- which is R^2 for regressors such as LinearRegression.
test_scores = cross_val_score(lr, X_train, Y_train, cv=5)

cross_val_predict

returns the predictions for each observation when it was in the test fold

from sklearn.model_selection import cross_val_predict
# Out-of-fold predictions: each row is predicted by the model that was fitted
# on the folds that did not contain it.
test_preds = cross_val_predict(estimator=lr, X=X_train, y=Y_train, cv=5)

GridSearchCV

from sklearn.model_selection import GridSearchCV
rr = Ridge()
# A single dict is one grid: every alpha is tried with every normalize setting
# (8 x 2 = 16 candidates). A *list* of separate dicts is searched as
# independent grids, so alpha and normalize would never be tuned jointly.
# NOTE: `normalize` is available in scikit-learn 0.23.x; it was deprecated in
# 1.0 and removed in 1.2 -- on newer versions drop it and put a StandardScaler
# in front of the model in a Pipeline instead.
params = dict(
  alpha=[0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
  normalize=[True, False],
)
# 5-fold cross-validation for each candidate parameter combination.
grid = GridSearchCV(rr, params, cv=5)
grid.fit(X_train, Y_train)
# Estimator refitted on all of the training data with the best parameters.
grid.best_estimator_
# cv_results_ is a dict of arrays with one entry per candidate.
scores = grid.cv_results_
scores['mean_test_score']

pipeline

from sklearn.pipeline import Pipeline
# Ordered (name, estimator) steps: every step but the last transforms the
# data, and the last one is the model that is fitted on the result.
Input = [
  ('scale', StandardScaler()),
  ('poly3', PolynomialFeatures(degree=3, include_bias=True)),
  ('model', LinearRegression()),
]
pipe = Pipeline(steps=Input)
# fit() returns the fitted pipeline, so predict() can be chained on.
yhat_test = pipe.fit(X_train, Y_train).predict(X_test)